Main Figures
Have a legend for all plots Since there’s a set parameter in set_colors.R file that was sourced at the beginning, we can use this throughout to can assure that the below legends will represent the legends called afterwards.
####### Legend for season
legend_plot <- ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = season)) +
geom_jitter(size = 3, width = 0.2) +
scale_fill_manual(values = fraction_colors, guide = guide_legend(override.aes = list(shape = 22, size = 4))) +
scale_shape_manual(values = season_shapes, guide = guide_legend(override.aes = list(size = 4))) +
theme(legend.position = "bottom", axis.title.x = element_blank(),
legend.title = element_blank(), legend.justification="center",
plot.margin = unit(c(0.2,0.2,1,0.2), "cm")) # top, right, bottom, and left margins
# Extract the legend
season_legend <- cowplot::get_legend(legend_plot + theme(legend.direction = "horizontal",legend.justification="center" ,legend.box.just = "bottom"))
### Make it 2 rows for plots that are skinnier
####### Legend for season
season_2row_plot <- ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = season)) +
geom_jitter(size = 3, width = 0.2) +
scale_fill_manual(values = fraction_colors, guide = guide_legend(override.aes = list(shape = 22, size = 4))) +
scale_shape_manual(values = season_shapes, guide = guide_legend(override.aes = list(size = 4))) +
theme(legend.position = "bottom", legend.justification="center",
axis.title.x = element_blank(),
legend.title = element_blank(), legend.box = "vertical",
legend.spacing.y = unit(0.1, 'cm'),
plot.margin = unit(c(0.2,0.2,1,0.2), "cm")) # top, right, bottom, and left margins
# Extract the legend
season_2row_legend <- cowplot::get_legend(season_2row_plot + theme(legend.direction = "horizontal",legend.justification="center" ,legend.box.just = "bottom"))
####### Legend for lakesite
lakesite_legend <- ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = lakesite)) +
geom_jitter(size = 3, width = 0.2) +
scale_fill_manual(values = fraction_colors, guide = guide_legend(override.aes = list(shape = 22, size = 4))) +
scale_shape_manual(values = lakesite_shapes, guide = guide_legend(override.aes = list(size = 4))) +
theme(legend.position = "bottom", axis.title.x = element_blank(),
legend.title = element_blank(), legend.justification="center",
plot.margin = unit(c(0.2,0.2,1,0.2), "cm")) # top, right, bottom, and left margins
# Extract the legend
lakesite_legend <- cowplot::get_legend(lakesite_legend + theme(legend.direction = "horizontal",legend.justification="center" ,legend.box.just = "bottom"))
Figure 1
######################################################### Fraction ABUNDANCe
frac_abund_wilcox <- wilcox.test(log10(as.numeric(fraction_bac_abund)) ~ fraction, data = metadata)
frac_abund_wilcox
##
## Wilcoxon rank sum test
##
## data: log10(as.numeric(fraction_bac_abund)) by fraction
## W = 0, p-value = 1.479e-06
## alternative hypothesis: true location shift is not equal to 0
metadata %>%
group_by(fraction) %>%
summarize(mean(as.numeric(fraction_bac_abund)))
## # A tibble: 2 x 2
## fraction `mean(as.numeric(fraction_bac_abund))`
## <fct> <dbl>
## 1 Particle 41169.
## 2 Free 734522.
# Make a data frame to draw significance line in boxplot (visually calculated)
dat1 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(6.45,6.5,6.5,6.45)) # WholePart vs WholeFree
poster_a <-
ggplot(filter(metadata, norep_filter_name != "MOTEJ515"),
aes(y = log10(as.numeric(fraction_bac_abund)), x = fraction)) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
ylab("Log10(Bacterial Counts) \n (cells/mL)") +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
##### Particle vs free cell abundances
geom_path(data = dat1, aes(x = a, y = b), linetype = 1, color = "gray40") +
annotate("text", x=1.5, y=6.55, size = 8, color = "gray40", label= paste("***")) +
annotate("text", x=1.5, y=6.4, size = 3.5, color = "gray40",
label= paste("p =", round(frac_abund_wilcox$p.value, digits = 6))) +
theme(legend.position = "bottom",
legend.title = element_blank(),
axis.title.x = element_blank())
######################################################### TOTAL PRODUCTION
totprod_wilcox <- wilcox.test(frac_bacprod ~ fraction, data = metadata)
totprod_wilcox
##
## Wilcoxon rank sum test
##
## data: frac_bacprod by fraction
## W = 33, p-value = 0.02418
## alternative hypothesis: true location shift is not equal to 0
metadata %>%
group_by(fraction) %>%
summarize(mean(frac_bacprod))
## # A tibble: 2 x 2
## fraction `mean(frac_bacprod)`
## <fct> <dbl>
## 1 Particle 9.96
## 2 Free 24.1
# Make a data frame to draw significance line in boxplot (visually calculated)
dat2 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(71,72,72,71)) # WholePart vs WholeFree
poster_b <- ggplot(metadata, aes(y = frac_bacprod, x = fraction)) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
scale_fill_manual(values = fraction_colors, guide = FALSE) +
scale_shape_manual(values = season_shapes) +
scale_y_continuous(limits = c(0, 78), expand = c(0,0), breaks = seq(0, 75, by = 15)) +
ylab("Community Production \n (μgC/L/day)") +
##### Particle vs free bulk production
geom_path(data = dat2, aes(x = a, y = b), linetype = 1, color = "gray40") +
annotate("text", x=1.5, y=73, size = 8, color = "gray40", label= paste("*")) +
annotate("text", x=1.5, y=69, size = 3.5, color = "gray40",
label= paste("p =", round(totprod_wilcox$p.value, digits = 3))) +
theme(legend.position = "none",
axis.title.x = element_blank())
######################################################### TOTAL PRODUCTION
percellprod_wilcox <- wilcox.test(log10(fracprod_per_cell) ~ fraction, data = filter(metadata, norep_filter_name != "MOTEJ515"))
percellprod_wilcox
##
## Wilcoxon rank sum test
##
## data: log10(fracprod_per_cell) by fraction
## W = 125, p-value = 6.656e-05
## alternative hypothesis: true location shift is not equal to 0
filter(metadata, norep_filter_name != "MOTEJ515") %>%
group_by(fraction) %>%
summarize(mean(fracprod_per_cell))
## # A tibble: 2 x 2
## fraction `mean(fracprod_per_cell)`
## <fct> <dbl>
## 1 Particle 0.000000482
## 2 Free 0.0000000387
# Make a data frame to draw significance line in boxplot (visually calculated)
dat3 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(-5.05,-5,-5,-5.05)) # WholePart vs WholeFree
poster_c <- ggplot(filter(metadata, norep_filter_name != "MOTEJ515"),
aes(y = log10(fracprod_per_cell), x = fraction)) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
scale_fill_manual(values = fraction_colors, guide = FALSE) +
scale_shape_manual(values = season_shapes) +
ylim(c(-8.5, -4.9)) +
ylab("Log10(Per-Capita Production) \n(μgC/cell/day)") +
##### Particle vs free per-cell production
geom_path(data = dat3, aes(x = a, y = b), linetype = 1, color = "gray40") +
annotate("text", x=1.5, y=-4.95, size = 8, color = "gray40", label= paste("***")) +
annotate("text", x=1.5, y=-5.15, size = 3.5, color = "gray40",
label= paste("p =", round(percellprod_wilcox$p.value, digits = 5))) +
theme(legend.position = "none",
axis.title.x = element_blank())
######## FIGURE 1
row1_plots <- plot_grid(poster_a + theme(legend.position = "none"), poster_b, poster_c,
labels = c("A", "B", "C"),
ncol = 3, nrow = 1)
fig_1 <- plot_grid(row1_plots, season_legend,
ncol = 1, nrow = 2,
rel_heights = c(1, 0.05)); fig_1
# What are the averages?
metadata %>%
group_by(fraction) %>%
dplyr::select(frac_bacprod, fracprod_per_cell_noinf) %>%
na.omit() %>%
summarize(avg_comm_prod = mean(frac_bacprod),
avg_percell_prod = mean(fracprod_per_cell_noinf),
log10_avg_percell_prod = log10(avg_percell_prod))
## # A tibble: 2 x 4
## fraction avg_comm_prod avg_percell_prod log10_avg_percell_prod
## <fct> <dbl> <dbl> <dbl>
## 1 Particle 9.83 0.000000482 -6.32
## 2 Free 24.1 0.0000000387 -7.41
Figure 2
site_div_order <- c("richness","shannon","simpson","unweighted_sesmpd",
"phylo_richness", "phylo_shannon","phylo_simpson", "weighted_sesmpd")
site_div_labs <- c("{}^0*italic(D)", "{}^1*italic(D)","{}^2*italic(D)", "italic(Unweighted_MPD)",
"{}^0*italic(PD)", "{}^1*italic(PD)","{}^2*italic(PD)", "italic(Weighted_MPD)")
long_div_meta_df <-
wide_div_meta_df %>%
dplyr::select(c(norep_filter_name, richness:weighted_sesmpd, fraction, lakesite, season)) %>%
gather(key = Diversity, value = div_val, richness:weighted_sesmpd) %>%
mutate(Diversity = fct_relevel(Diversity, levels = site_div_order),
hill_labels = factor(Diversity, labels = site_div_labs))
wc_pafl_pvals1 <- c(wilcox.test(richness ~ fraction, data = wide_div_meta_df)$p.value,
wilcox.test(shannon ~ fraction, data = wide_div_meta_df)$p.value,
wilcox.test(simpson ~ fraction, data = wide_div_meta_df)$p.value,
wilcox.test(unweighted_sesmpd ~ fraction, data = wide_div_meta_df)$p.value,
# Row 2
wilcox.test(phylo_richness ~ fraction, data = wide_div_meta_df)$p.value,
wilcox.test(phylo_shannon ~ fraction, data = wide_div_meta_df)$p.value,
wilcox.test(phylo_simpson ~ fraction, data = wide_div_meta_df)$p.value,
wilcox.test(weighted_sesmpd ~ fraction, data = wide_div_meta_df)$p.value)
pafl_pvals_adjusted <- p.adjust(wc_pafl_pvals1, method = "fdr")
wc_pafl_pvals <- c(paste("p=", round(pafl_pvals_adjusted[1], digits = 3), sep = ""),
paste("p=", round(pafl_pvals_adjusted[2], digits = 3), sep = ""),
paste("N", "S", sep = ""),
paste("p=", round(pafl_pvals_adjusted[4], digits = 2), sep = ""),
# Row 2
paste("p=", round(pafl_pvals_adjusted[5], digits = 3),sep = ""),
paste("p=", round(pafl_pvals_adjusted[6], digits = 3), sep = ""),
paste("p=", round(pafl_pvals_adjusted[7], digits = 3), sep = ""),
paste("N", "S", sep = ""))
fraction <- as.character(rep("Free", 8))
wide_div_meta_df %>%
filter(fraction == "Free") %>%
summarize(max_rich = max(richness), max_shan = max(shannon), max_simps = max(simpson), max_umpd = max(unweighted_sesmpd),
# row 2
max_phyrich = max(phylo_richness), max_physhan = max(phylo_shannon), max_physimps = max(phylo_simpson), max_wmpd = max(weighted_sesmpd))
## max_rich max_shan max_simps max_umpd max_phyrich max_physhan max_physimps max_wmpd
## 1 767 101.572 43.426 1.000442 107.8883 8.437559 4.054059 -0.0006459971
# Locations of labels
maxes <- c(1350, 299, 78, 1.35, 150, 15.2, 5.8, 0.1)
df_temp <- data.frame(hill_labels = site_div_labs, wc_pafl_pvals, fraction, maxes)
long_div_meta_df %>%
# left_join(df_temp, by = "Diversity") %>%
ggplot(aes(y = div_val, x = fraction, fill = fraction)) +
ylab("Diversity Value By Fraction") +
facet_wrap(hill_labels~., scales = "free", labeller = label_parsed, nrow = 2, ncol = 4) +
geom_point(size = 3, position = position_jitterdodge(), aes(shape = lakesite)) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, color = "black") +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = lakesite_shapes) +
scale_color_manual(values = fraction_colors) +
theme(legend.position = "none", axis.title.x = element_blank()) +
geom_text(data = df_temp, aes(x = fraction, y = maxes, label = wc_pafl_pvals), size = 4.5, hjust = 0.5,color = "grey40")
Linear Models
Prepare data frames for table S1 and S2
## Prepare the dataframes for Free and Particle data
metadata_pca_PD_free <- dplyr::filter(wide_div_meta_df, fraction == "Free")
metadata_pca_PD_part <- dplyr::filter(wide_div_meta_df, fraction == "Particle")
# List of environmental variables to run linear models on
# BGA_cellspermL and Turb_NTU do not have enough data points for regression
lm_variables <- c("frac_bacprod", "fracprod_per_cell_noinf", "Temp_C", "SpCond_uSpercm", "TDS_mgperL",
"pH", "ORP_mV", "Chl_Lab_ugperL", "Cl_mgperL", "SO4_mgperL", "NO3_mgperL",
"NH3_mgperL", "TKN_mgperL", "SRP_ugperL", "TP_ugperL", "Alk_mgperL", "DO_mgperL",
"total_bac_abund","attached_bac", "dnaconcrep1",
"PC1", "PC2",
"richness", "shannon", "simpson", "phylo_richness", "phylo_shannon", "phylo_simpson",
"unweighted_sesmpd", "weighted_sesmpd")
########################################################
############## Particle: Community-wide
lm_summary_particle_comm <- metadata_pca_PD_part %>%
dplyr::select(one_of(lm_variables), -fracprod_per_cell_noinf) %>%
gather(key = independent, value = measurement, -frac_bacprod) %>%
mutate(measurement = as.numeric(measurement)) %>%
group_by(independent) %>%
do(glance(lm(frac_bacprod ~ measurement, data = .))) %>%
mutate(fraction = "Particle", dependent = "Community Production") %>%
dplyr::select(fraction, dependent, independent, logLik, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
############## Particle: Per-capita
lm_summary_particle_percap <- metadata_pca_PD_part %>%
dplyr::select(one_of(lm_variables), -frac_bacprod) %>%
filter(!is.na(fracprod_per_cell_noinf)) %>% # Remove samples that are not NA
gather(key = independent, value = measurement, -fracprod_per_cell_noinf) %>%
mutate(measurement = as.numeric(measurement)) %>%
group_by(independent) %>%
do(glance(lm(log10(fracprod_per_cell_noinf) ~ measurement, data = .))) %>%
mutate(fraction = "Particle", dependent = "Per-Capita Production") %>%
dplyr::select(fraction, dependent, independent, logLik, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
########################################################
############## Free: Community-wide
lm_summary_free_comm <- metadata_pca_PD_free %>%
dplyr::select(one_of(lm_variables), -fracprod_per_cell_noinf) %>%
gather(key = independent, value = measurement, -frac_bacprod) %>%
mutate(measurement = as.numeric(measurement)) %>%
group_by(independent) %>%
do(glance(lm(frac_bacprod ~ measurement, data = .))) %>%
mutate(fraction = "Free", dependent = "Community Production") %>%
dplyr::select(fraction, dependent, independent, logLik, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
############## Free: Per-capita
lm_summary_free_percap <- metadata_pca_PD_free %>%
dplyr::select(one_of(lm_variables), -frac_bacprod) %>%
filter(!is.na(fracprod_per_cell_noinf)) %>% # Remove samples that are not NA
gather(key = independent, value = measurement, -fracprod_per_cell_noinf) %>%
mutate(measurement = as.numeric(measurement)) %>%
group_by(independent) %>%
do(glance(lm(log10(fracprod_per_cell_noinf) ~ measurement, data = .))) %>%
mutate(fraction = "Free", dependent = "Per-Capita Production") %>%
dplyr::select(fraction, dependent, independent, logLik, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
## Filtered PCA Linear Model Results
# Put all of the PCA and environmental linear model results together into one dataframe
all_pca_div_environ_lm_results <-
bind_rows(lm_summary_particle_comm, lm_summary_particle_percap,
lm_summary_free_comm, lm_summary_free_percap) %>%
dplyr::rename(independent_var = independent,
dependent_var = dependent)
## Combine the results with the diversity linear models
individual_lm_results <- all_pca_div_environ_lm_results %>%
mutate(AIC = round(AIC, digits = 2),
adj.r.squared = round(adj.r.squared, digits = 2),
p.value = round(p.value, digits = 4),
FDR.p = round(FDR.p, digits = 4)) %>%
rename(FDR.p.value = FDR.p)
# Free-Living samples only
div_measures_free <- comb_div_df %>%
dplyr::select(fraction, Observed, Diversity, frac_bacprod, fracprod_per_cell_noinf) %>%
filter(fraction == "Free") %>%
dplyr::select(-fraction)
# Particle-associated samples only
div_measures_part <- comb_div_df %>%
dplyr::select(fraction, Observed, Diversity, frac_bacprod, fracprod_per_cell_noinf) %>%
filter(fraction == "Particle") %>%
dplyr::select(-fraction)
## All of the samples!
div_measures_all <- comb_div_df %>%
dplyr::select(Observed, Diversity, frac_bacprod, fracprod_per_cell_noinf)
########################################################
############## Particle: Community-wide
lm_divs_particle_comm <- div_measures_part %>%
dplyr::select(-fracprod_per_cell_noinf) %>%
group_by(Diversity) %>%
do(glance(lm(frac_bacprod ~ Observed, data = .))) %>%
mutate(fraction = "Particle", dependent = "Community Production") %>%
dplyr::select(fraction, dependent, Diversity, logLik, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
############## Particle: Community-wide
lm_divs_particle_percap <- div_measures_part %>%
dplyr::select(-frac_bacprod) %>%
group_by(Diversity) %>%
do(glance(lm(log10(fracprod_per_cell_noinf) ~ Observed, data = .))) %>%
mutate(fraction = "Particle", dependent = "Per-Capita Production") %>%
dplyr::select(fraction, dependent, Diversity, logLik, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
########################################################
############## Free: Community-wide
lm_divs_free_comm <- div_measures_free %>%
dplyr::select(-fracprod_per_cell_noinf) %>%
group_by(Diversity) %>%
do(glance(lm(frac_bacprod ~ Observed, data = .))) %>%
mutate(fraction = "Free", dependent = "Community Production") %>%
dplyr::select(fraction, dependent, Diversity, logLik, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
############## Free: Community-wide
lm_divs_free_percap <- div_measures_free %>%
dplyr::select(-frac_bacprod) %>%
group_by(Diversity) %>%
do(glance(lm(log10(fracprod_per_cell_noinf) ~ Observed, data = .))) %>%
mutate(fraction = "Free", dependent = "Per-Capita Production") %>%
dplyr::select(fraction, dependent, Diversity, logLik, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
# Community-Wide Production
table_comm_wide <- bind_rows(lm_divs_particle_comm, lm_divs_free_comm) %>%
dplyr::select(-dependent) %>%
mutate(AIC = round(AIC, digits = 1),
adj.r.squared = round(adj.r.squared, digits = 3),
p.value = round(p.value, digits = 3),
FDR.p = round(FDR.p, digits = 3))
datatable(table_comm_wide,
caption = "Ordinary least squares regression statistics for community-wide heterotrophic production",
options = list(pageLength = 12), rownames = FALSE)
# Log10 Per-capita Production
table_percap <- bind_rows(lm_divs_particle_percap, lm_divs_free_percap) %>%
dplyr::select(-dependent) %>%
mutate(AIC = round(AIC, digits = 1),
adj.r.squared = round(adj.r.squared, digits = 3),
p.value = round(p.value, digits = 3),
FDR.p = round(FDR.p, digits = 3))
datatable(table_percap,
caption = "Ordinary least squares regression statistics for log10(per-capita production)",
options = list(pageLength = 12), rownames = FALSE)
# Here I will calculate and show the stats for each linear model,
# so later we can simply use these objects for plotting.
######################################################### COMMUNITY WIDE PRODUCTION VS DIVERSITY
######################################################### COMMUNITY WIDE PRODUCTION VS DIVERSITY
# linear model for community production vs species richness
lm_prod_vs_rich_PA <- lm(frac_bacprod ~ richness, data = filter(wide_div_meta_df, fraction == "Particle"))
summary(lm_prod_vs_rich_PA) # show stats for PA richness vs community production
##
## Call:
## lm(formula = frac_bacprod ~ richness, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5089 -1.9461 -0.8788 4.1222 7.7585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.288864 5.169998 -1.990 0.07461 .
## richness 0.025351 0.006185 4.099 0.00215 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.281 on 10 degrees of freedom
## Multiple R-squared: 0.6268, Adjusted R-squared: 0.5895
## F-statistic: 16.8 on 1 and 10 DF, p-value: 0.00215
lm_prod_vs_rich_FL <- lm(frac_bacprod ~ richness, data = filter(wide_div_meta_df, fraction == "Free"))
summary(lm_prod_vs_rich_FL) # show stats for FL richness vs community production
##
## Call:
## lm(formula = frac_bacprod ~ richness, data = filter(wide_div_meta_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.085 -11.604 -3.654 7.778 33.986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.41713 21.15292 0.398 0.699
## richness 0.02985 0.03915 0.762 0.463
##
## Residual standard error: 17.87 on 10 degrees of freedom
## Multiple R-squared: 0.05494, Adjusted R-squared: -0.03956
## F-statistic: 0.5814 on 1 and 10 DF, p-value: 0.4634
# linear model for community production vs exp(H')
lm_prod_vs_shannon_PA <- lm(frac_bacprod ~ shannon, data = filter(wide_div_meta_df, fraction == "Particle"))
summary(lm_prod_vs_shannon_PA) # show stats for PA shannon vs community production
##
## Call:
## lm(formula = frac_bacprod ~ shannon, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.072 -2.513 -0.871 2.511 11.983
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.63496 3.07675 0.206 0.84064
## shannon 0.07952 0.02215 3.590 0.00493 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.715 on 10 degrees of freedom
## Multiple R-squared: 0.5631, Adjusted R-squared: 0.5194
## F-statistic: 12.89 on 1 and 10 DF, p-value: 0.00493
lm_prod_vs_shannon_FL <- lm(frac_bacprod ~ shannon, data = filter(wide_div_meta_df, fraction == "Free"))
summary(lm_prod_vs_shannon_FL) # show stats for Fl shannon vs community production
##
## Call:
## lm(formula = frac_bacprod ~ shannon, data = filter(wide_div_meta_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.447 -11.763 -3.907 6.671 37.518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.3526 18.0841 0.904 0.387
## shannon 0.1318 0.2960 0.445 0.666
##
## Residual standard error: 18.21 on 10 degrees of freedom
## Multiple R-squared: 0.01945, Adjusted R-squared: -0.07861
## F-statistic: 0.1983 on 1 and 10 DF, p-value: 0.6656
# linear model for community production vs inverse simpson
lm_prod_vs_invsimps_PA <- lm(frac_bacprod ~ simpson, data = filter(wide_div_meta_df, fraction == "Particle"))
summary(lm_prod_vs_invsimps_PA) # show stats for PA simpson vs community production
##
## Call:
## lm(formula = frac_bacprod ~ simpson, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5237 -2.1046 -0.1732 0.8787 7.7518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.39792 2.41547 -0.165 0.872433
## simpson 0.28978 0.05672 5.109 0.000458 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.55 on 10 degrees of freedom
## Multiple R-squared: 0.723, Adjusted R-squared: 0.6953
## F-statistic: 26.1 on 1 and 10 DF, p-value: 0.000458
lm_prod_vs_invsimps_FL <- lm(frac_bacprod ~ simpson, data = filter(wide_div_meta_df, fraction == "Free"))
summary(lm_prod_vs_invsimps_FL) # show stats for FL simpson vs community production
##
## Call:
## lm(formula = frac_bacprod ~ simpson, data = filter(wide_div_meta_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.737 -9.536 -4.360 6.152 35.830
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.8132 16.5860 0.652 0.529
## simpson 0.5476 0.6521 0.840 0.421
##
## Residual standard error: 17.77 on 10 degrees of freedom
## Multiple R-squared: 0.06587, Adjusted R-squared: -0.02754
## F-statistic: 0.7052 on 1 and 10 DF, p-value: 0.4207
######################## PHYLOGENETIC HILL DIVERSITY METRICS ################################
# 0D: linear model for community production vs phylogenetic richness
lm_prod_vs_phylorich_PA <- lm(frac_bacprod ~ phylo_richness, data = filter(wide_div_meta_df, fraction == "Particle"))
summary(lm_prod_vs_phylorich_PA) # show stats for PA phylogenetic richness vs community production
##
## Call:
## lm(formula = frac_bacprod ~ phylo_richness, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.149 -2.542 -0.352 3.800 8.312
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -16.84405 6.89756 -2.442 0.03473 *
## phylo_richness 0.26781 0.06716 3.988 0.00257 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.372 on 10 degrees of freedom
## Multiple R-squared: 0.6139, Adjusted R-squared: 0.5753
## F-statistic: 15.9 on 1 and 10 DF, p-value: 0.002568
lm_prod_vs_phylorich_FL <- lm(frac_bacprod ~ phylo_richness, data = filter(wide_div_meta_df, fraction == "Free"))
summary(lm_prod_vs_phylorich_FL) # show stats for FL phylogenetic richness vs community production
##
## Call:
## lm(formula = frac_bacprod ~ phylo_richness, data = filter(wide_div_meta_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.425 -12.206 -4.266 7.672 35.243
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.2168 23.4687 0.393 0.703
## phylo_richness 0.1977 0.3048 0.649 0.531
##
## Residual standard error: 18.01 on 10 degrees of freedom
## Multiple R-squared: 0.04036, Adjusted R-squared: -0.0556
## F-statistic: 0.4206 on 1 and 10 DF, p-value: 0.5313
# 1D: linear model for community production vs phylogenetic exp(H')
lm_prod_vs_phyloshan_PA <- lm(frac_bacprod ~ phylo_shannon, data = filter(wide_div_meta_df, fraction == "Particle"))
summary(lm_prod_vs_phyloshan_PA) # show stats for PA phylogenetic shannon vs community production
##
## Call:
## lm(formula = frac_bacprod ~ phylo_shannon, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6937 -4.1718 -0.8138 1.9023 17.8593
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.1167 7.4666 -0.819 0.4318
## phylo_shannon 1.7957 0.8024 2.238 0.0492 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.057 on 10 degrees of freedom
## Multiple R-squared: 0.3337, Adjusted R-squared: 0.2671
## F-statistic: 5.008 on 1 and 10 DF, p-value: 0.04918
lm_prod_vs_phyloshan_FL <- lm(frac_bacprod ~ phylo_shannon, data = filter(wide_div_meta_df, fraction == "Free"))
summary(lm_prod_vs_phyloshan_FL) # show stats for Fl phylogenetic shannon vs community production
##
## Call:
## lm(formula = frac_bacprod ~ phylo_shannon, data = filter(wide_div_meta_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.963 -13.212 -1.868 6.471 39.452
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.2777 25.5533 0.911 0.384
## phylo_shannon 0.1225 3.9235 0.031 0.976
##
## Residual standard error: 18.39 on 10 degrees of freedom
## Multiple R-squared: 9.753e-05, Adjusted R-squared: -0.09989
## F-statistic: 0.0009754 on 1 and 10 DF, p-value: 0.9757
# 2D: linear model for community production vs phylogenetic inverse simpson
lm_prod_vs_phylosimps_PA <- lm(frac_bacprod ~ phylo_simpson, data = filter(wide_div_meta_df, fraction == "Particle"))
summary(lm_prod_vs_phylosimps_PA) # show stats for PA phylogeneticsimpson vs community production
##
## Call:
## lm(formula = frac_bacprod ~ phylo_simpson, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0325 -3.5603 -0.8954 1.8803 19.6664
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.938 10.655 -0.745 0.473
## phylo_simpson 4.340 2.529 1.716 0.117
##
## Residual standard error: 7.598 on 10 degrees of freedom
## Multiple R-squared: 0.2276, Adjusted R-squared: 0.1503
## F-statistic: 2.946 on 1 and 10 DF, p-value: 0.1168
lm_prod_vs_phylosimps_FL <- lm(frac_bacprod ~ phylo_simpson, data = filter(wide_div_meta_df, fraction == "Free"))
summary(lm_prod_vs_phylosimps_FL) # show stats for FL phylogenetic simpson vs community production
##
## Call:
## lm(formula = frac_bacprod ~ phylo_simpson, data = filter(wide_div_meta_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.206 -8.193 -3.880 4.975 36.382
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.338 21.958 0.289 0.779
## phylo_simpson 6.262 7.544 0.830 0.426
##
## Residual standard error: 17.78 on 10 degrees of freedom
## Multiple R-squared: 0.06445, Adjusted R-squared: -0.0291
## F-statistic: 0.6889 on 1 and 10 DF, p-value: 0.4259
######################################################### PER CAPITA PRODUCTION VS DIVERSITY
######################################################### PER CAPITA PRODUCTION VS DIVERSITY
################
# linear model for community production vs species richness
lm_percell_prod_vs_rich_PA <- lm(log10(fracprod_per_cell_noinf) ~ richness,
data = filter(wide_div_meta_df, fraction == "Particle"));
summary(lm_percell_prod_vs_rich_PA) # show stats for PA richness vs percell production
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ richness, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69547 -0.18868 0.00469 0.25891 0.43373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.9719413 0.3519522 -22.651 3.02e-09 ***
## richness 0.0015438 0.0004157 3.714 0.00481 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3526 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6052, Adjusted R-squared: 0.5613
## F-statistic: 13.79 on 1 and 9 DF, p-value: 0.004814
lm_percell_prod_vs_rich_FL <- lm(log10(fracprod_per_cell_noinf) ~ richness,
data = filter(wide_div_meta_df, fraction == "Free"));
summary(lm_percell_prod_vs_rich_FL) # show stats for FL richness vs percell production
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ richness, data = filter(wide_div_meta_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69596 -0.10098 0.06691 0.22207 0.55462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.1445195 0.4530938 -17.975 6.08e-09 ***
## richness 0.0010870 0.0008387 1.296 0.224
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3829 on 10 degrees of freedom
## Multiple R-squared: 0.1438, Adjusted R-squared: 0.05819
## F-statistic: 1.68 on 1 and 10 DF, p-value: 0.2241
# linear model for community production vs exp(H')
lm_percell_prod_vs_shannon_PA <- lm(log10(fracprod_per_cell_noinf) ~ shannon,
data = filter(wide_div_meta_df, fraction == "Particle"))
summary(lm_percell_prod_vs_shannon_PA) # show stats for PA shannon vs percell production
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ shannon, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.40181 -0.18618 -0.07638 0.08795 0.70036
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.326221 0.192332 -38.092 2.94e-11 ***
## shannon 0.005092 0.001362 3.739 0.00463 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3511 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6084, Adjusted R-squared: 0.5649
## F-statistic: 13.98 on 1 and 9 DF, p-value: 0.004631
lm_percell_prod_vs_shannon_FL <- lm(log10(fracprod_per_cell_noinf) ~ shannon,
data = filter(wide_div_meta_df, fraction == "Free"))
summary(lm_percell_prod_vs_shannon_FL) # show stats for FL shannon vs percell production
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ shannon, data = filter(wide_div_meta_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71198 -0.19486 0.06765 0.14506 0.67898
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.871742 0.399113 -19.723 2.46e-09 ***
## shannon 0.005075 0.006533 0.777 0.455
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4018 on 10 degrees of freedom
## Multiple R-squared: 0.05693, Adjusted R-squared: -0.03738
## F-statistic: 0.6036 on 1 and 10 DF, p-value: 0.4552
# linear model for community production vs inverse simpson
lm_percell_prod_vs_invsimps_PA <- lm(log10(fracprod_per_cell_noinf) ~ simpson,
data = filter(wide_div_meta_df, fraction == "Particle"))
summary(lm_percell_prod_vs_invsimps_PA) # show stats for PA simpson vs percell production
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ simpson, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28621 -0.18272 -0.11287 0.07464 0.56371
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.357307 0.158221 -46.500 4.92e-12 ***
## simpson 0.017852 0.003694 4.833 0.00093 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2959 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.7219, Adjusted R-squared: 0.691
## F-statistic: 23.36 on 1 and 9 DF, p-value: 0.00093
lm_percell_prod_vs_invsimps_FL <- lm(log10(fracprod_per_cell_noinf) ~ simpson,
data = filter(wide_div_meta_df, fraction == "Free"))
summary(lm_percell_prod_vs_invsimps_FL) # show stats for FL simpson vs percell production
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ simpson, data = filter(wide_div_meta_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.68128 -0.11210 0.01117 0.17801 0.62672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.03955 0.35395 -22.71 6.17e-10 ***
## simpson 0.01920 0.01392 1.38 0.198
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3792 on 10 degrees of freedom
## Multiple R-squared: 0.16, Adjusted R-squared: 0.07597
## F-statistic: 1.904 on 1 and 10 DF, p-value: 0.1977
######################## PHYLOGENETIC HILL DIVERSITY METRICS ################################
# linear model for community production vs phylogenetic richness
lm_percell_prod_vs_phylorich_PA <- lm(log10(fracprod_per_cell_noinf) ~ phylo_richness,
data = filter(wide_div_meta_df, fraction == "Particle"));
summary(lm_percell_prod_vs_phylorich_PA) # show stats for PA phylogeneticrichness vs percell production
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ phylo_richness,
## data = filter(wide_div_meta_df, fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67460 -0.19919 -0.00661 0.29035 0.44116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.375458 0.464552 -18.03 2.26e-08 ***
## phylo_richness 0.016357 0.004482 3.65 0.00532 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3563 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.5968, Adjusted R-squared: 0.552
## F-statistic: 13.32 on 1 and 9 DF, p-value: 0.005318
lm_percell_prod_vs_phylorich_FL <- lm(log10(fracprod_per_cell_noinf) ~ phylo_richness,
data = filter(wide_div_meta_df, fraction == "Free"));
summary(lm_percell_prod_vs_phylorich_FL) # show stats for FL phylogenetic richness vs percell production
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ phylo_richness,
## data = filter(wide_div_meta_df, fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.68143 -0.10689 0.05126 0.23501 0.57378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.207228 0.498634 -16.46 1.43e-08 ***
## phylo_richness 0.008420 0.006476 1.30 0.223
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3827 on 10 degrees of freedom
## Multiple R-squared: 0.1446, Adjusted R-squared: 0.05905
## F-statistic: 1.69 on 1 and 10 DF, p-value: 0.2227
# linear model for community production vs phylogenetic exp(H')
lm_percell_prod_vs_phyloshan_PA <- lm(log10(fracprod_per_cell_noinf) ~ phylo_shannon,
data = filter(wide_div_meta_df, fraction == "Particle"))
summary(lm_percell_prod_vs_phyloshan_PA) # show stats for PA phylogeneticshannon vs percell production
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ phylo_shannon,
## data = filter(wide_div_meta_df, fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46570 -0.19804 -0.09387 -0.01228 1.04909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.81526 0.45758 -17.080 3.64e-08 ***
## phylo_shannon 0.12300 0.04952 2.484 0.0348 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4322 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.4067, Adjusted R-squared: 0.3408
## F-statistic: 6.169 on 1 and 9 DF, p-value: 0.03478
lm_percell_prod_vs_phyloshan_FL <- lm(log10(fracprod_per_cell_noinf) ~ phylo_shannon,
data = filter(wide_div_meta_df, fraction == "Free"))
summary(lm_percell_prod_vs_phyloshan_FL) # show stats for FL phylogenetic shannon vs percell production
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ phylo_shannon,
## data = filter(wide_div_meta_df, fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.66194 -0.28222 0.05688 0.20007 0.75034
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.632289 0.574784 -13.279 1.12e-07 ***
## phylo_shannon 0.008985 0.088253 0.102 0.921
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4136 on 10 degrees of freedom
## Multiple R-squared: 0.001036, Adjusted R-squared: -0.09886
## F-statistic: 0.01037 on 1 and 10 DF, p-value: 0.9209
# linear model for community production vs inverse simpson
lm_percell_prod_vs_phylosimps_PA <- lm(log10(fracprod_per_cell_noinf) ~ phylo_simpson,
data = filter(wide_div_meta_df, fraction == "Particle"))
summary(lm_percell_prod_vs_phylosimps_PA) # show stats for PA phylogeneticsimpson vs percell production
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ phylo_simpson,
## data = filter(wide_div_meta_df, fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59626 -0.18652 -0.08578 -0.02009 1.15951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.0385 0.6598 -12.182 6.77e-07 ***
## phylo_simpson 0.3230 0.1587 2.036 0.0723 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4643 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3153, Adjusted R-squared: 0.2392
## F-statistic: 4.144 on 1 and 9 DF, p-value: 0.07227
lm_percell_prod_vs_phylosimps_FL <- lm(log10(fracprod_per_cell_noinf) ~ phylo_simpson,
data = filter(wide_div_meta_df, fraction == "Free"))
summary(lm_percell_prod_vs_phylosimps_FL) # show stats for FL phylogenetic simpson vs percell production
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ phylo_simpson,
## data = filter(wide_div_meta_df, fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.68749 -0.13428 0.02948 0.15692 0.66844
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.0711 0.4847 -16.650 1.28e-08 ***
## phylo_simpson 0.1753 0.1665 1.053 0.317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3926 on 10 degrees of freedom
## Multiple R-squared: 0.09974, Adjusted R-squared: 0.009714
## F-statistic: 1.108 on 1 and 10 DF, p-value: 0.3173
Figure 3
Prepare Figure 3
######################################################### OBSERVED RICHNESS #########################################################
rich_fraction_wilcox <- wilcox.test(richness ~ fraction, data = wide_div_meta_df)
rich_fraction_wilcox
##
## Wilcoxon rank sum test
##
## data: richness by fraction
## W = 125, p-value = 0.001433
## alternative hypothesis: true location shift is not equal to 0
wide_div_meta_df %>%
group_by(fraction) %>%
summarize(mean(richness), sd(richness), min(richness), max(richness))
## # A tibble: 2 x 5
## fraction `mean(richness)` `sd(richness)` `min(richness)` `max(richness)`
## <fct> <dbl> <dbl> <int> <int>
## 1 Particle 799. 257. 468 1394
## 2 Free 524. 138. 326 767
# Make a data frame to draw significance line in boxplot (visually calculated)
rich_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(1350, 1400, 1400, 1350)) # WholePart vs WholeFree
rich_distribution_plot <-
wide_div_meta_df %>%
ggplot(aes(y = richness, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_y_continuous(limits = c(150,1500), breaks = seq(from = 0, to =1500, by = 300)) +
labs(y = expression({}^0*italic(D)),
x = "Fraction") +
scale_shape_manual(values = season_shapes) +
geom_path(data = rich_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=1500, size = 8, color = "#424645", label= paste("***"), angle = 90) +
annotate("text", x=1.5, y=1150, size = 4, color = "#424645",
label= paste("p =", round(rich_fraction_wilcox$p.value, digits = 3))) +
theme(legend.position = "none",# axis.title.y = element_blank(),
axis.title.x = element_blank(), axis.text.x = element_blank()) +
coord_flip()
################ Richness vs Community-wide (Per-Liter) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_perliter_rich_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_rich_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_prod_vs_rich_PA)$coefficients[,4][2]), digits = 3), ")")
## 2. Plot Richness vs Community-wide (Per-Liter) Production
prod_vs_rich_plot <-
wide_div_meta_df %>%
# Fetch the standard error from diversity measure to plot error bars
left_join(filter(div_iNEXT, Diversity == "Species richness"), by = "norep_filter_name") %>%
rename(se_iNEXT ="s.e.") %>%
ggplot(aes(x=richness, y=frac_bacprod)) +
geom_errorbarh(aes(xmin = richness - se_iNEXT, xmax = richness + se_iNEXT,
color = fraction), alpha = 0.7) + # X-axis errorbars
geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod,
color = fraction)) + # Y-axis errorbars
geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
labs(x = expression({}^0*italic(D)),
y = "\n Community Production \n (μgC/L/day)") +
geom_smooth(data=filter(wide_div_meta_df, fraction == "Particle"),
method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(150,1500), breaks = seq(from = 0, to =1500, by = 300)) +
scale_y_continuous(limits = c(-5, 75), breaks = seq(0, 75, by = 15)) +
# Add the R2 and p-value to the plot
annotate("text", x=1200, y=50, label=lm_lab_perliter_rich_PA, parse = TRUE, color = "#FF6600", size = 4) +
theme(legend.position = "none") # axis.title.x = element_blank(), axis.text.x = element_blank()
# Summary stats
summary(lm(frac_bacprod ~ richness, data = filter(wide_div_meta_df, fraction == "Particle")))
##
## Call:
## lm(formula = frac_bacprod ~ richness, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5089 -1.9461 -0.8788 4.1222 7.7585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.288864 5.169998 -1.990 0.07461 .
## richness 0.025351 0.006185 4.099 0.00215 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.281 on 10 degrees of freedom
## Multiple R-squared: 0.6268, Adjusted R-squared: 0.5895
## F-statistic: 16.8 on 1 and 10 DF, p-value: 0.00215
# WITHOUT THE TWO HIGH POINTS
summary(lm(frac_bacprod ~ richness, data = filter(wide_div_meta_df, fraction == "Particle" & richness < 1000)))
##
## Call:
## lm(formula = frac_bacprod ~ richness, data = filter(wide_div_meta_df,
## fraction == "Particle" & richness < 1000))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.152 -3.181 -1.600 3.647 8.554
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.404104 8.621046 0.279 0.787
## richness 0.006798 0.012051 0.564 0.588
##
## Residual standard error: 4.844 on 8 degrees of freedom
## Multiple R-squared: 0.03826, Adjusted R-squared: -0.08196
## F-statistic: 0.3182 on 1 and 8 DF, p-value: 0.5881
################ Richness vs Per Capita (Per-Cell) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_percell_rich_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_rich_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_percell_prod_vs_rich_PA)$coefficients[,4][2]), digits = 3), ")")
## 2. Plot Richness vs Per Capita (Per-Cell) Production
percell_prod_vs_rich_plot <-
wide_div_meta_df %>%
# Fetch the standard error from diversity measure to plot error bars
left_join(filter(div_iNEXT, Diversity == "Species richness"), by = "norep_filter_name") %>%
rename(se_iNEXT ="s.e.") %>%
ggplot(aes(x=richness, y=log10(fracprod_per_cell_noinf))) +
geom_errorbarh(aes(xmin = richness - se_iNEXT, xmax = richness + se_iNEXT,
color = fraction), alpha = 0.7) + # X-axis errorbars
geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
labs(x = expression({}^0*italic(D)),
y = "\n log10(Per-Capita Production)\n (μgC/cell/day)") +
geom_smooth(data=filter(wide_div_meta_df, fraction == "Particle"),
method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(150,1500), breaks = seq(from = 0, to =1500, by = 300)) +
scale_y_continuous(limits = c(-8.5,-5), breaks = seq(from = -8, to =-5, by = 1)) +
#scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) +
# Add the R2 and p-value to the plot
annotate("text", x=1200, y=-7.5, label=lm_lab_percell_rich_PA, parse = TRUE, color = "#FF6600", size = 4) +
theme(legend.title = element_blank(), legend.position ="bottom",
legend.text = element_text(size = 14))
# Summary stats
summary(lm(log10(fracprod_per_cell_noinf) ~ richness, data = filter(wide_div_meta_df, fraction == "Particle")))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ richness, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69547 -0.18868 0.00469 0.25891 0.43373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.9719413 0.3519522 -22.651 3.02e-09 ***
## richness 0.0015438 0.0004157 3.714 0.00481 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3526 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6052, Adjusted R-squared: 0.5613
## F-statistic: 13.79 on 1 and 9 DF, p-value: 0.004814
# WITHOUT THE TWO HIGH POINTS
summary(lm(log10(fracprod_per_cell_noinf) ~ richness, data = filter(wide_div_meta_df, fraction == "Particle" & richness < 1000)))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ richness, data = filter(wide_div_meta_df,
## fraction == "Particle" & richness < 1000))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32289 -0.25690 -0.00076 0.25072 0.37097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.911e+00 5.157e-01 -13.401 3.02e-06 ***
## richness -2.318e-05 7.197e-04 -0.032 0.975
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2893 on 7 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.0001482, Adjusted R-squared: -0.1427
## F-statistic: 0.001037 on 1 and 7 DF, p-value: 0.9752
######################################################### 2D = SIMPSON DIVERSITY (INVERSE SIMPSON)
######################################################### 2D = SIMPSON DIVERSITY (INVERSE SIMPSON)
invsimps_fraction_wilcox <- wilcox.test(simpson ~ fraction, data = wide_div_meta_df)
invsimps_fraction_wilcox
##
## Wilcoxon rank sum test
##
## data: simpson by fraction
## W = 80, p-value = 0.6707
## alternative hypothesis: true location shift is not equal to 0
wide_div_meta_df %>%
group_by(fraction) %>%
summarize(mean(simpson), sd(simpson), min(simpson), max(simpson))
## # A tibble: 2 x 5
## fraction `mean(simpson)` `sd(simpson)` `min(simpson)` `max(simpson)`
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Particle 35.7 24.2 12.6 80.2
## 2 Free 24.2 8.22 12.2 43.4
# Make a data frame to draw significance line in boxplot (visually calculated)
invsimps_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(83,85,85,83)) # WholePart vs WholeFree
invsimps_distribution_plot <-
wide_div_meta_df %>%
ggplot(aes(y = simpson, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_y_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
labs(y = expression({}^2*italic(D)),
x = "Fraction") +
geom_path(data = invsimps_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=80, size = 4, color = "#424645", label= "NS") +
theme(legend.position = "none", #axis.title.y = element_blank(),
axis.title.x = element_blank(), axis.text.x = element_blank()) +
coord_flip()
################ Inverse Simpson vs Community-wide (Per-Liter) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_perliter_invsimps_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_prod_vs_invsimps_PA)$coefficients[,4][2]), digits = 4), ")")
## 2. Plot Inverse Simpson vs Community-wide (Per-Liter) Production
prod_vs_invsimps_plot <-
wide_div_meta_df %>%
# Fetch the standard error from diversity measure to plot error bars
left_join(filter(div_iNEXT, Diversity == "Simpson diversity"), by = "norep_filter_name") %>%
rename(se_iNEXT ="s.e.") %>%
ggplot(aes(x=simpson, y=frac_bacprod)) +
geom_errorbarh(aes(xmin = simpson - se_iNEXT, xmax = simpson + se_iNEXT,
color = fraction), alpha = 0.7) + # X-axis errorbars
geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) + # Y-axis errorbars
geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
labs(x = expression({}^2*italic(D)),
y = "Community Production \n (μgC/L/day)") +
geom_smooth(data=filter(wide_div_meta_df, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
scale_y_continuous(limits = c(-5, 75), breaks = seq(0, 75, by = 15)) +
# Add the R2 and p-value to the plot
annotate("text", x=70, y=45, label=lm_lab_perliter_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
theme(legend.position = "none") #axis.title.x = element_blank(), axis.text.x = element_blank()
# Summary stats
summary(lm(frac_bacprod ~ simpson, data = filter(wide_div_meta_df, fraction == "Particle")))
##
## Call:
## lm(formula = frac_bacprod ~ simpson, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5237 -2.1046 -0.1732 0.8787 7.7518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.39792 2.41547 -0.165 0.872433
## simpson 0.28978 0.05672 5.109 0.000458 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.55 on 10 degrees of freedom
## Multiple R-squared: 0.723, Adjusted R-squared: 0.6953
## F-statistic: 26.1 on 1 and 10 DF, p-value: 0.000458
# WITHOUT THE TWO HIGH POINTS
summary(lm(frac_bacprod ~ simpson, data = filter(wide_div_meta_df, fraction == "Particle" & simpson < 70)))
##
## Call:
## lm(formula = frac_bacprod ~ simpson, data = filter(wide_div_meta_df,
## fraction == "Particle" & simpson < 70))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3147 -1.6362 -0.3721 1.4970 6.5303
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.64917 2.47998 0.665 0.5248
## simpson 0.20339 0.08038 2.530 0.0352 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.681 on 8 degrees of freedom
## Multiple R-squared: 0.4445, Adjusted R-squared: 0.3751
## F-statistic: 6.402 on 1 and 8 DF, p-value: 0.03524
################ Inverse Simpson vs Per Capitra (Per-Cell) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_percell_invsimps_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_percell_prod_vs_invsimps_PA)$coefficients[,4][2]), digits = 4), ")")
## 2. Plot Inverse Simpson vs Per Capitra (Per-Cell) Production
percell_prod_vs_invsimps_plot <-
wide_div_meta_df %>%
# Fetch the standard error from diversity measure to plot error bars
left_join(filter(div_iNEXT, Diversity == "Simpson diversity"), by = "norep_filter_name") %>%
rename(se_iNEXT ="s.e.") %>%
ggplot(aes(x=simpson, y=log10(fracprod_per_cell_noinf))) +
geom_errorbarh(aes(xmin = simpson - se_iNEXT, xmax = simpson + se_iNEXT,
color = fraction), alpha = 0.7) + # X-axis errorbars
geom_point(size = 3.5, color = "black", aes(shape = season, fill = fraction)) +
scale_color_manual(values = fraction_colors, guide = TRUE) +
scale_fill_manual(values = fraction_colors, guide = FALSE) +
scale_shape_manual(values = season_shapes) +
labs(x = expression({}^2*italic(D)),
y = "log10(production/cell)\n (μgC/cell/day)") +
geom_smooth(data=filter(wide_div_meta_df, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
scale_y_continuous(limits = c(-8.5,-5), breaks = seq(from = -8, to =-5, by = 1)) +
# Add the R2 and p-value to the plot
annotate("text", x=70, y=-7.5, label=lm_lab_percell_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
guides(color = guide_legend(override.aes = list(shape = 15))) +
theme(legend.title = element_blank(), legend.position ="bottom",
legend.text = element_text(size = 14))
# Summary stats
summary(lm(log10(fracprod_per_cell_noinf) ~ simpson, data = filter(wide_div_meta_df, fraction == "Particle")))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ simpson, data = filter(wide_div_meta_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28621 -0.18272 -0.11287 0.07464 0.56371
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.357307 0.158221 -46.500 4.92e-12 ***
## simpson 0.017852 0.003694 4.833 0.00093 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2959 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.7219, Adjusted R-squared: 0.691
## F-statistic: 23.36 on 1 and 9 DF, p-value: 0.00093
# WITHOUT THE TWO HIGH POINTS
summary(lm(log10(fracprod_per_cell_noinf) ~ simpson, data = filter(wide_div_meta_df, fraction == "Particle" & simpson < 70)))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ simpson, data = filter(wide_div_meta_df,
## fraction == "Particle" & simpson < 70))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.23956 -0.19011 -0.03463 0.07002 0.49075
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.172281 0.164525 -43.59 8.73e-10 ***
## simpson 0.009474 0.005539 1.71 0.131
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2429 on 7 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2947, Adjusted R-squared: 0.194
## F-statistic: 2.925 on 1 and 7 DF, p-value: 0.1309
# Plot Inverse simpson plots altogether
invsimps_plots <- plot_grid(invsimps_distribution_plot, prod_vs_invsimps_plot, percell_prod_vs_invsimps_plot,
labels = c("B", "D", "F"), ncol = 1, nrow = 3,
rel_heights = c(0.5, 0.8, 1.1),
align = "v")
Plot Figure 3
# All plots together
fig3_top <- plot_grid(prod_vs_rich_plot,
prod_vs_invsimps_plot,
percell_prod_vs_rich_plot + theme(legend.position = "none"),
percell_prod_vs_invsimps_plot + theme(legend.position = "none"),
labels = c("A", "B", "C", "D"), ncol = 2, nrow = 2,
rel_heights = c(1, 1), rel_widths = c(1,1),
align = "hv")
# add legend
plot_grid(fig3_top, season_legend,
ncol = 1, nrow = 2,
rel_heights = c(1, 0.05))
Figure 4
Phylodiv vs ses.MPD
# Is there a relationship between phylogenetic richness (0D) and the unweighted mean pairwise distance?
# 1. Since it's a general correlation - the linear model will be for
lm_unweightMPD_phylorich <- lm(unweighted_sesmpd ~ phylo_richness, data = wide_div_meta_df)
## 2. Extract the R2 and p-value from the linear model:
lm_lab_unweightMPD_phylorich <- paste("atop(R^2 ==", round(summary(lm_unweightMPD_phylorich)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_unweightMPD_phylorich)$coefficients[,4][2]), digits = 3), ")")
## 3. Plot
phy_MPD_p0 <- wide_div_meta_df %>%
ggplot(aes(x = phylo_richness, y = unweighted_sesmpd, fill = fraction)) +
geom_point(aes(shape = season), size = 3) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
labs(x = expression({}^0*italic(PD)),
y = "Unweighted MPD") +
geom_smooth(method='lm', color = "black", fill = "grey") +
# Add the R2 and p-value to the plot
annotate("text", x=140, y=1, label=lm_lab_unweightMPD_phylorich, parse = TRUE, color = "black", size = 4) +
theme(legend.position = "none")
plot_grid(phy_MPD_p0, season_2row_legend, rel_heights = c(1, 0.08),
nrow = 2, ncol = 1)
Lasso Regressions
Prepare data for lasso
# The lasso regression needs datasets that are wide formatted
lasso_cols_remove <- c("project", "year", "Date", "limnion", "dnaconcrep1", "perc_attached_bacprod",
"SD_perc_attached_bacprod", "SE_total_bac_abund", "SE_perc_attached_abund", "SE_attached_bac",
"tot_bacprod", "SD_tot_bacprod","SD_frac_bacprod", "BGA_cellspermL", "Turb_NTU",
"lakesite", "season", "station")
### PARTICLE DATA = 12 samples
# Note: Even though in the inclusion of MOTEJ515 doesn't have the bacterial counts,
# the results from this lasso regression does not change.
lasso_data_df_particle <- wide_div_meta_df %>%
filter(fraction == "Particle" ) %>%
dplyr::select(-c(fraction, lasso_cols_remove))
lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf, norep_filter_name, norep_water_name))
### FREE DATA
# Note: Even though in the inclusion of MOTEJ515 doesn't have the bacterial counts,
# the results from this lasso regression does not change.
lasso_data_df_free <- wide_div_meta_df %>%
filter(fraction == "Free" ) %>%
dplyr::select(-c(fraction, lasso_cols_remove))
lasso_data_df_free_noprod <- lasso_data_df_free %>%
dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf, norep_filter_name, norep_water_name))
Run Lasso regression
Lasso - Community Production: Particle
# Set the seed for reproducibility of the grid values
set.seed(777)
################ PREPARE DATA ################
# Subset data needed and scale all o
scaled_comm_data <-
lasso_data_df_particle_noprod %>% # Use data only for the particle samples
scale() %>% # Scale all of the variables to have mean =0 and sd = 1
as.data.frame() # Make it a dataframe so that model.matrix function works (does not take a matrix)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(frac_bacprod ~ ., scaled_comm_data)[,-1]
y = scaled_comm_data$frac_bacprod
grid = 10^seq(10,-2,length = 100)
################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ LASSO ################
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique 'x' values
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min # Minimum lambda
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] TRUE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2) # Test
## [1] 1.459684
## Run lasso on the entire dataset with the best lambda value
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 32 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -5.001679e-17
## richness 2.878328e-02
## shannon .
## simpson 3.049965e-01
## phylo_shannon .
## phylo_simpson .
## phylo_richness .
## unweighted_sesmpd .
## weighted_sesmpd .
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## Now, run the most simple, parsimonious lasso model within 1 standard error
## IMPORTANT NOTE: This is the lasso reported within the manuscript
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 32 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -5.001679e-17
## richness 2.878328e-02
## shannon .
## simpson 3.049965e-01
## phylo_shannon .
## phylo_simpson .
## phylo_richness .
## unweighted_sesmpd .
## weighted_sesmpd .
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
Lasso - Community Production: Free
# Set the seed for reproducibility of the grid values
set.seed(777)
################ PREPARE DATA ################
# Subset data needed and scale all o
scaled_comm_data_free <-
lasso_data_df_free_noprod %>% # Use data only for the free-living samples
scale() %>% # Scale all of the variables to have mean = 0 and sd = 1
as.data.frame() # Make it a dataframe so that model.matrix function works (does not take a matrix)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
free_x <- model.matrix(frac_bacprod ~ ., scaled_comm_data_free)[,-1]
free_y <- scaled_comm_data_free$frac_bacprod
free_grid <- 10^seq(10,-2,length = 100)
################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
free_train <- sample(1:nrow(free_x), nrow(free_x)/2)
free_test <- -free_train
free_y_test <- free_y[free_test]
################ LASSO ################
# Run lasso regression with alpha = 1
lasso_divs_train_free_comm <- glmnet(free_x[free_train,], free_y[free_train], alpha = 1, lambda = free_grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train_free_comm)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique 'x' values
# Cross validation
free_cv_lasso_divs <- cv.glmnet(free_x[free_train,], free_y[free_train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(free_cv_lasso_divs)
free_best_lasso_lambda <- free_cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
free_lasso_lambda_1se <- free_cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
free_best_lasso_lambda == free_lasso_lambda_1se
## [1] FALSE
free_lasso_divs_pred <- predict(lasso_divs_train_free_comm, s = free_best_lasso_lambda, newx = free_x[free_test,])
mean((free_lasso_divs_pred - y_test)^2) # Test
## [1] 1.425752
## Run lasso on the entire dataset with the best lambda value
free_lasso_divs <- glmnet(free_x, free_y, alpha = 1, lambda = free_best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(free_lasso_divs, type = "coefficients", s = free_best_lasso_lambda)
## 32 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -5.076392e-16
## richness .
## shannon .
## simpson .
## phylo_shannon .
## phylo_simpson .
## phylo_richness .
## unweighted_sesmpd .
## weighted_sesmpd .
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -2.606707e-01
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## Now, run the most simple, parsimonious lasso model within 1 standard error
## IMPORTANT NOTE: This is the lasso reported within the manuscript
free_lasso_divs_1se <- glmnet(free_x, free_y, alpha = 1, lambda = free_lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(free_lasso_divs_1se, type = "coefficients", s = free_lasso_lambda_1se)
## 32 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.127014e-16
## richness .
## shannon .
## simpson .
## phylo_shannon .
## phylo_simpson .
## phylo_richness .
## unweighted_sesmpd .
## weighted_sesmpd .
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -2.144478e-01
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .